2장 - 무작위 실험 및 기초 통계 리뷰

2.1 무작위 배정으로 독립성 확보하기

2.2 A/B 테스트 사례

실제 산업 현장에서 무작위 실험은 A/B 테스트라는 이름으로 널리 사용됩니다. 새로운 기능을 도입하기 전, 일부 사용자에게만 노출하여 효과를 정밀하게 측정합니다.

import pandas as pd

pd.set_option("display.max_rows", 5)
import pandas as pd  # for data manipulation
import numpy as np  # for numerical computation

data = pd.read_csv("../data/cross_sell_email.csv")
data
gender cross_sell_email age conversion
0 0 short 15 0
1 1 short 27 0
... ... ... ... ...
321 1 no_email 16 0
322 1 long 24 1

323 rows × 4 columns

(data.groupby(["cross_sell_email"]).mean())
gender age conversion
cross_sell_email
long 0.550459 21.752294 0.055046
no_email 0.542553 20.489362 0.042553
short 0.633333 20.991667 0.125000
X = ["gender", "age"]

mu = data.groupby("cross_sell_email")[X].mean()
var = data.groupby("cross_sell_email")[X].var()

norm_diff = (mu - mu.loc["no_email"]) / np.sqrt((var + var.loc["no_email"]) / 2)

norm_diff
gender age
cross_sell_email
long 0.015802 0.221423
no_email 0.000000 0.000000
short 0.184341 0.087370

2.3 이상적인 실험

이상적인 실험에서는 모든 외생 변수가 통제되며, 오직 처치 변수만이 결과에 영향을 미칩니다.

2.4 가장 위험한 수식

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
from matplotlib import pyplot as plt
from cycler import cycler
import matplotlib

default_cycler = cycler(color=["0.1", "0.5", "1.0"])

color = ["0.3", "0.5", "0.7", "0.9"]
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]

plt.rc("axes", prop_cycle=default_cycler)

matplotlib.rcParams.update({"font.size": 18})
df = pd.read_csv("data/enem_scores.csv")
df.sort_values(by="avg_score", ascending=False).head(10)
year school_id number_of_students avg_score
16670 2007 33062633 68 82.97
16796 2007 33065403 172 82.04
... ... ... ... ...
14636 2007 31311723 222 79.41
17318 2007 33087679 210 79.38

10 rows × 4 columns

plot_data = df.assign(top_school=df["avg_score"] >= np.quantile(df["avg_score"], 0.99))[
    ["top_school", "number_of_students"]
].query(
    f"number_of_students<{np.quantile(df['number_of_students'], 0.98)}"
)  # remove outliers

plt.figure(figsize=(8, 4))
ax = sns.boxplot(x="top_school", y="number_of_students", data=plot_data)

plt.title("Number of Students of 1% Top Schools (Right)")
Text(0.5, 1.0, 'Number of Students of 1% Top Schools (Right)')

q_99 = np.quantile(df["avg_score"], 0.99)
q_01 = np.quantile(df["avg_score"], 0.01)

plot_data = df.sample(10000).assign(
    Group=lambda d: np.select(
        [(d["avg_score"] > q_99) | (d["avg_score"] < q_01)],
        ["Top and Bottom"],
        "Middle",
    )
)
plt.figure(figsize=(10, 5))
sns.scatterplot(
    y="avg_score",
    x="number_of_students",
    data=plot_data.query("Group=='Middle'"),
    label="Middle",
)
ax = sns.scatterplot(
    y="avg_score",
    x="number_of_students",
    data=plot_data.query("Group!='Middle'"),
    color="0.7",
    label="Top and Bottom",
)

plt.title("ENEM Score by Number of Students in the School")
Text(0.5, 1.0, 'ENEM Score by Number of Students in the School')

2.5 추정값의 표준오차

추정된 처치 효과가 우연에 의한 것인지 아닌지를 판단하기 위해 표준오차(Standard Error)를 계산합니다. 표본의 크기가 클수록 추정값의 불확실성은 줄어듭니다.

data = pd.read_csv("../data/cross_sell_email.csv")

short_email = data.query("cross_sell_email=='short'")["conversion"]
long_email = data.query("cross_sell_email=='long'")["conversion"]
email = data.query("cross_sell_email!='no_email'")["conversion"]
no_email = data.query("cross_sell_email=='no_email'")["conversion"]

data.groupby("cross_sell_email").size()
cross_sell_email
long        109
no_email     94
short       120
dtype: int64
def se(y: pd.Series):
    return y.std() / np.sqrt(len(y))


print("SE for Long Email:", se(long_email))
print("SE for Short Email:", se(short_email))
SE for Long Email: 0.021946024609185506
SE for Short Email: 0.030316953129541618
print("SE for Long Email:", long_email.sem())
print("SE for Short Email:", short_email.sem())
SE for Long Email: 0.021946024609185506
SE for Short Email: 0.030316953129541618

2.6 신뢰구간

n = 100
conv_rate = 0.08


def run_experiment():
    return np.random.binomial(1, conv_rate, size=n)


np.random.seed(42)

experiments = [run_experiment().mean() for _ in range(10000)]
plt.figure(figsize=(10, 4))
freq, bins, img = plt.hist(experiments, bins=20, label="Experiment Means", color="0.6")
plt.vlines(
    conv_rate,
    ymin=0,
    ymax=freq.max(),
    linestyles="dashed",
    label="True Mean",
    color="0.3",
)
plt.legend()
<matplotlib.legend.Legend at 0x7fd451a4bc10>

np.random.seed(42)
plt.figure(figsize=(10, 4))
plt.hist(np.random.binomial(1, 0.08, 100), bins=20)
(array([92.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  8.]),
 array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
        0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]),
 <BarContainer object of 20 artists>)

x = np.linspace(-4, 4, 100)
y = stats.norm.pdf(x, 0, 1)

plt.figure(figsize=(10, 4))
plt.plot(x, y, linestyle="solid")
plt.fill_between(x.clip(-3, +3), 0, y, alpha=0.5, label="~99.7% mass", color="C2")
plt.fill_between(x.clip(-2, +2), 0, y, alpha=0.5, label="~95% mass", color="C1")
plt.fill_between(x.clip(-1, +1), 0, y, alpha=0.5, label="~68% mass", color="C0")
plt.ylabel("Density")
plt.legend()
<matplotlib.legend.Legend at 0x7fd451c295d0>

exp_se = short_email.sem()
exp_mu = short_email.mean()
ci = (exp_mu - 2 * exp_se, exp_mu + 2 * exp_se)
print("95% CI for Short Email: ", ci)
95% CI for Short Email:  (0.06436609374091676, 0.18563390625908324)
x = np.linspace(exp_mu - 4 * exp_se, exp_mu + 4 * exp_se, 100)
y = stats.norm.pdf(x, exp_mu, exp_se)

plt.figure(figsize=(10, 4))
plt.plot(x, y, lw=3)
plt.vlines(ci[1], ymin=0, ymax=4, ls="dotted")
plt.vlines(ci[0], ymin=0, ymax=4, ls="dotted", label="95% CI")
plt.xlabel("Conversion")
plt.legend()
<matplotlib.legend.Legend at 0x7fd46289cdd0>

from scipy import stats

z = np.abs(stats.norm.ppf((1 - 0.99) / 2))
print(z)
ci = (exp_mu - z * exp_se, exp_mu + z * exp_se)
ci
2.5758293035489004
(0.04690870373460816, 0.20309129626539185)
stats.norm.ppf((1 - 0.99) / 2)
-2.5758293035489004
x = np.linspace(exp_mu - 4 * exp_se, exp_mu + 4 * exp_se, 100)
y = stats.norm.pdf(x, exp_mu, exp_se)

plt.figure(figsize=(10, 4))
plt.plot(x, y, lw=3)
plt.vlines(ci[1], ymin=0, ymax=4, ls="dotted")
plt.vlines(ci[0], ymin=0, ymax=4, ls="dotted", label="99% CI")


ci_95 = (exp_mu - 1.96 * exp_se, exp_mu + 1.96 * exp_se)

plt.vlines(ci_95[1], ymin=0, ymax=4, ls="dashed")
plt.vlines(ci_95[0], ymin=0, ymax=4, ls="dashed", label="95% CI")
plt.xlabel("Conversion")
plt.legend()
<matplotlib.legend.Legend at 0x7fd462983b50>

def ci(y: pd.Series):
    return (y.mean() - 2 * y.sem(), y.mean() + 2 * y.sem())


print("95% CI for Short Email:", ci(short_email))
print("95% CI for Long Email:", ci(long_email))
print("95% CI for No Email:", ci(no_email))
95% CI for Short Email: (0.06436609374091676, 0.18563390625908324)
95% CI for Long Email: (0.01115382234126202, 0.09893792077800403)
95% CI for No Email: (0.0006919679286838468, 0.08441441505003955)
plt.figure(figsize=(10, 4))

x = np.linspace(-0.05, 0.25, 100)
short_dist = stats.norm.pdf(x, short_email.mean(), short_email.sem())
plt.plot(x, short_dist, lw=2, label="Short", linestyle=linestyle[0])
plt.fill_between(
    x.clip(ci(short_email)[0], ci(short_email)[1]),
    0,
    short_dist,
    alpha=0.2,
    color="0.0",
)

long_dist = stats.norm.pdf(x, long_email.mean(), long_email.sem())
plt.plot(x, long_dist, lw=2, label="Long", linestyle=linestyle[1])
plt.fill_between(
    x.clip(ci(long_email)[0], ci(long_email)[1]), 0, long_dist, alpha=0.2, color="0.4"
)

no_email_dist = stats.norm.pdf(x, no_email.mean(), no_email.sem())
plt.plot(x, no_email_dist, lw=2, label="No email", linestyle=linestyle[2])
plt.fill_between(
    x.clip(ci(no_email)[0], ci(no_email)[1]), 0, no_email_dist, alpha=0.2, color="0.8"
)

plt.xlabel("Conversion")
plt.legend()
<matplotlib.legend.Legend at 0x7fd451a96810>

2.7 가설검정

import seaborn as sns
from matplotlib import pyplot as plt

np.random.seed(123)

n1 = np.random.normal(4, 3, 30000)
n2 = np.random.normal(1, 4, 30000)
n_diff = n2 - n1

plt.figure(figsize=(10, 4))
sns.distplot(
    n1, hist=False, label="$N(4,3^2)$", color="0.0", kde_kws={"linestyle": linestyle[0]}
)
sns.distplot(
    n2, hist=False, label="$N(1,4^2)$", color="0.4", kde_kws={"linestyle": linestyle[1]}
)
sns.distplot(
    n_diff,
    hist=False,
    label="$N(-3, 5^2) = N(1,4^2) - (4,3^2)$",
    color="0.8",
    kde_kws={"linestyle": linestyle[1]},
)
plt.legend();

diff_mu = short_email.mean() - no_email.mean()
diff_se = np.sqrt(no_email.sem() ** 2 + short_email.sem() ** 2)

ci = (diff_mu - 1.96 * diff_se, diff_mu + 1.96 * diff_se)
print(f"95% CI for the differece (short email - no email):\n{ci}")
95% CI for the differece (short email - no email):
(0.01023980847439844, 0.15465380854687816)
x = np.linspace(diff_mu - 4 * diff_se, diff_mu + 4 * diff_se, 100)
y = stats.norm.pdf(x, diff_mu, diff_se)

plt.figure(figsize=(10, 3))
plt.plot(x, y, lw=3)
plt.vlines(ci[1], ymin=0, ymax=4, ls="dotted")
plt.vlines(ci[0], ymin=0, ymax=4, ls="dotted", label="95% CI")
plt.xlabel("Diff. in Conversion (Short - No Email)\n")
plt.legend()
plt.subplots_adjust(bottom=0.15)

2.7.1 귀무가설

# shifting the CI
diff_mu_shifted = short_email.mean() - no_email.mean() - 0.01
diff_se = np.sqrt(no_email.sem() ** 2 + short_email.sem() ** 2)

ci = (diff_mu_shifted - 1.96 * diff_se, diff_mu_shifted + 1.96 * diff_se)
print(f"95% CI 1% difference between (short email - no email):\n{ci}")
95% CI 1% difference between (short email - no email):
(0.00023980847439844521, 0.14465380854687815)

2.7.2 검정통계량

t_stat = (diff_mu - 0) / diff_se
t_stat
2.2379512318715364

2.8 p 값

x = np.linspace(-4, 4, 100)
y = stats.norm.pdf(x, 0, 1)

plt.figure(figsize=(10, 4))
plt.plot(x, y, lw=2)
plt.vlines(t_stat, ymin=0, ymax=0.1, ls="dotted", label="T-Stat", lw=2)
plt.fill_between(x.clip(t_stat), 0, y, alpha=0.4, label="P-value")
plt.legend()
<matplotlib.legend.Legend at 0x7fd462fd6650>

print("P-value:", (1 - stats.norm.cdf(t_stat)) * 2)
P-value: 0.025224235562152142

2.9 검정력

stats.norm.cdf(0.84)
0.7995458067395503

2.10 표본 크기 계산

# in the book it is np.ceil(16 * no_email.std()**2/0.01), but it is missing the **2 in the denominator.
np.ceil(16 * (no_email.std() / 0.08) ** 2)
103.0
data.groupby("cross_sell_email").size()
cross_sell_email
long        109
no_email     94
short       120
dtype: int64

2.11 요약